knitr::opts_chunk$set(echo = TRUE)
price_data <- read.delim("prices-and-earnings.txt")
price_data <- price_data[,c(1,2,5,6,7,9,10,16,17,18,19)]
rownames(price_data) <- price_data$City
price_data_scale <- scale(price_data[,-1])
rownames(price_data_scale) <- price_data$City
plot_ly(x=colnames(price_data_scale), y=rownames(price_data_scale),
z=price_data_scale, type="heatmap", colors =colorRamp(c("yellow", "red"))) %>% layout(title = "Heatmap of price dataset")
Analysis: No its not possible to see clusters, very hard to even see an outlier.
price_row_dist <- dist(x=price_data_scale, method = "euclidean", diag = TRUE)
price_col_dist <- dist(x=t(price_data_scale), method = "euclidean", diag = TRUE)
price_row_cor <- cor(x=t(price_data_scale))
price_col_cor <- cor(x=price_data_scale)
price_row_cor <- 1- price_row_cor
price_col_cor <- 1- price_col_cor
order1 <- seriate(price_row_dist, "OLO")
order2 <- seriate(price_col_dist, "OLO")
ord1 <- get_order(order1)
ord2 <- get_order(order2)
order1_cor <- seriate(as.dist(price_row_cor), "OLO")
order2_cor <- seriate(as.dist(price_col_cor), "OLO")
ord1_cor <- get_order(order1_cor)
ord2_cor <- get_order(order2_cor)
reordmatr <- price_data_scale[rev(ord1),ord2]
reordmatr_cor <- price_data_scale[rev(ord1_cor),ord2_cor]
plot_ly(x=colnames(reordmatr), y=rownames(reordmatr),
z=reordmatr, type="heatmap", colors =colorRamp(c("yellow", "red"))) %>% layout(title = "Heatmap of price dataset ordered using Optimal leaf ordering")
plot_ly(x=colnames(reordmatr_cor), y=rownames(reordmatr_cor),
z=reordmatr_cor, type="heatmap", colors =colorRamp(c("yellow", "red"))) %>% layout(title = "Heatmap of correlations of price dataset")
Analysis: Both are graphs are similar to each other. We can see a cluster in the digonals of both the plots.
order1_tsp <- seriate(price_row_dist, "TSP")
order2_tsp <- seriate(price_col_dist, "TSP")
ord1_tsp <- get_order(order1_tsp)
ord2_tsp <- get_order(order2_tsp)
reordmatr <- price_data_scale[rev(ord1_tsp),ord2_tsp]
plot_ly(x=colnames(reordmatr), y=rownames(reordmatr),
z=reordmatr, type="heatmap", colors =colorRamp(c("yellow", "red"))) %>% layout(title = "Heatmap of price dataset ordered using Traveling Sales Man")
## Comparing the optimization
criterion(dist(price_data_scale[rev(ord1_tsp)], method = "euclidean"), method = c("Gradient_raw", "Path_length"))
## Gradient_raw Path_length
## 34671.00000 33.23556
criterion(dist(price_data_scale[rev(ord1)], method = "euclidean"), method = c("Gradient_raw", "Path_length"))
## Gradient_raw Path_length
## 30318.00000 32.03208
Analysis: The ‘Travelling Sales Man’ approach performs worse than ‘Hierarchical clustering’ in terms of Gradient raw but has a better path length.
setnames(price_data, old = c("Food.Costs...", "iPhone.4S.hr.", "Clothing.Index", "Hours.Worked",
"Wage.Net", "Vacation.Days", "Big.Mac.min.", "Bread.kg.in.min.",
"Rice.kg.in.min.", "Goods.and.Services..."),
new=c("FoodCosts", "iPhone4S", "ClothingIndex", "Hours", "NetWage", "VacationDays",
"BigMac", "Bread", "Rice", "GoodsServices"))
price_data[,-1] %>% plot_ly(type = 'parcoords',
dimensions = list(
list(label = "Food Costs", values = ~FoodCosts),
list(label = "iPhone 4S", values = ~iPhone4S),
list(label = "Clothing Index", values = ~ClothingIndex),
list(label = "Hours Worked", values = ~Hours),
list(label = "Wage Net", values = ~NetWage),
list(label = "Vacation Days", values = ~VacationDays),
list(label = "Big Mac", values = ~BigMac),
list(label = "Bread", values = ~Bread),
list(label = "Rice", values = ~Rice),
list(label = "Goods Services", values = ~GoodsServices)))
#coloring lines
data=price_data[,-1]
data$BigMac_cluster=ifelse(data$BigMac <= 21, 1, 0)
obj <- ggparcoord(data, scale="uniminmax", groupColumn = "BigMac_cluster")
ggplotly(obj)
Analysis: Using iPhone4S, BigMac, Netwage and Foodcost clusters can be formed. We can see that split on the variable BigMac, there are two distinct clusters, one where BigMac < 21 and BigMac > 21. There appears to be no outlier considering the variables that we have formed the cluster on (BigMac, NetWage, FoodCost, iPhone4S)
reordmatr <- as.data.frame(reordmatr)
reordmatr$City = rownames(reordmatr)
reordmatr_transformed <- reordmatr%>%tidyr::gather(variable, value, -City, factor_key=T)%>%arrange(City)
first_col_order <- rownames(reordmatr)
second_col_order <- unique(reordmatr_transformed$variable)
reordmatr_transformed$City <- factor(reordmatr_transformed$City , levels = first_col_order)
reordmatr_transformed$variable <- factor(reordmatr_transformed$variable , levels = second_col_order)
reordmatr_transformed <- reordmatr_transformed[order(reordmatr_transformed$City, reordmatr_transformed$variable),]
radar_plot <- reordmatr_transformed %>% ggplot(aes(x=variable, y=value, group=City)) + geom_polygon(fill="blue") + coord_polar() + theme_bw() + facet_wrap(~ City) + theme(axis.text.x = element_text(size = 5))
ggsave("radar_plot.png", width = 40, height = 60, units = "cm")
Analysis: Berlin, Bejing and Bogoat is one of the cluster. Caracas, Copenhagen and Chicago is the second cluster. The outliers are the following: Bogoat in first cluster and Copenhagen in second cluster.
Analysis: Radar plot was the most effective. Its easy to see clusters and pattern and compare two very classes very well as well as picking outliers.
knitr::opts_chunk$set(echo = TRUE)
adult_data <- read.csv("adult.csv", header = FALSE)
colnames(adult_data) <- c("age", "workclass", "fnlwgt", "educationType", "education", "marital", "occupation", "relationship", "race", "sex", "capitalGain", "capitalLoss", "hours", "nativeCountry", "incomeLevel")
# Scatter Plot
ggplot(data = adult_data, aes(x=age, y=hours, colour = incomeLevel)) + geom_point() + ggtitle("Hours per Week vs. Age")
# Trellis Plot
ggplot(data = adult_data, aes(x=age, y=hours, color = incomeLevel)) + geom_point()+ facet_grid(incomeLevel~.) + ggtitle("Hours per Week vs. Age")
Analysis: People earning less than 50K tend to work for longer hours and longer age. This information was difficult to see in the scatter plot
ggplot(adult_data, aes(age, colour = incomeLevel, fill = incomeLevel)) + geom_density(alpha = 0.1) + ggtitle("Denisty Plot of Age")
ggplot(data = adult_data, aes(x=age, color = incomeLevel, fill = incomeLevel)) + geom_density(alpha = 0.1) + facet_grid(marital~.) + ggtitle("Trellis Plot of Age vs. Marital Status")
Analysis: The Mode of ‘Never-mind’ of people earning (<=50K) is the lowest among all classes. The difference in age between <=50K and >50K is largest in ‘Never-mind’. The highest amount of widowed are around age 65, and there is no difference between the IncomeLevel.
adult_data %>% filter(capitalLoss > 0) %>% plot_ly(x = ~education, y = ~age, z = ~capitalLoss) %>% add_markers() %>% layout(scene = list(xaxis = list(title = 'Education'), yaxis = list(title = 'Age'), zaxis = list(title = 'CapitalLoss')))
adult_data$age_discreet <- cut_number(adult_data$age, 6)
raster_3d <- adult_data %>% filter(capitalLoss > 0) %>% ggplot(aes(x=education, y=capitalLoss)) + stat_density_2d(aes(fill = ..density..), geom = "raster", contour = FALSE) + facet_grid(age_discreet~.) + ggtitle("Trellis Plot of Captail Loss vs. Education")
ggsave("raster_3d.png", width = 40, height = 60, units = "cm")
Analysis: The 3D scatter plot is difficult to analyse due to density of data around similar values. In the raster type 2D density we definetly see outliers/clusters, which is the most prominent in the age group of 17-24, around captailloss of 1500.
# Trellis Plot
adult_data$age_discreet <- cut_number(adult_data$age, 4)
ggplot(data = adult_data, aes(x=education, y=capitalLoss, color = capitalLoss)) +
geom_point() + facet_grid(age_discreet~.) +
ggtitle("Education vs. Capital Loss")
Agerange <-lattice::equal.count(adult_data$age, number=4, overlap=0.10) #overlap is 10%
L<-matrix(unlist(levels(Agerange)), ncol=2, byrow = T)
L1<-data.frame(Lower=L[,1],Upper=L[,2], Interval=factor(1:nrow(L)))
index=c()
Class=c()
for(i in 1:nrow(L)){
Cl=paste("[", L1$Lower[i], ",", L1$Upper[i], "]", sep="")
ind=which(adult_data$age>=L1$Lower[i] &adult_data$age<=L1$Upper[i])
index=c(index,ind)
Class=c(Class, rep(Cl, length(ind)))
}
df4<-adult_data[index,]
df4$Class<-as.factor(Class)
ggplot(data = df4, aes(x=education, y=capitalLoss, color = capitalLoss)) +
geom_point() + facet_grid(Class~.) +
ggtitle("Education vs. Capital Loss using Shingles")
Analysis: In this case contribution by Shingles seems to be not too great, the advatage of shingles compared to simple split would be that sudden jumps in data would be missed if it occured in the transition period (change of quaters). The disadvatge is that the onus of choosing the overlap falls on the designer/plotter, otherwise there would be redudunacy of data.
knitr::opts_chunk$set(echo = FALSE)
library(data.table)
library(dplyr)
library(scales)
library(plotly)
library(ggplot2)
library(seriation)
library(GGally)
knitr::opts_chunk$set(echo = TRUE)
price_data <- read.delim("prices-and-earnings.txt")
price_data <- price_data[,c(1,2,5,6,7,9,10,16,17,18,19)]
rownames(price_data) <- price_data$City
price_data_scale <- scale(price_data[,-1])
rownames(price_data_scale) <- price_data$City
plot_ly(x=colnames(price_data_scale), y=rownames(price_data_scale),
z=price_data_scale, type="heatmap", colors =colorRamp(c("yellow", "red"))) %>% layout(title = "Heatmap of price dataset")
price_row_dist <- dist(x=price_data_scale, method = "euclidean", diag = TRUE)
price_col_dist <- dist(x=t(price_data_scale), method = "euclidean", diag = TRUE)
price_row_cor <- cor(x=t(price_data_scale))
price_col_cor <- cor(x=price_data_scale)
price_row_cor <- 1- price_row_cor
price_col_cor <- 1- price_col_cor
order1 <- seriate(price_row_dist, "OLO")
order2 <- seriate(price_col_dist, "OLO")
ord1 <- get_order(order1)
ord2 <- get_order(order2)
order1_cor <- seriate(as.dist(price_row_cor), "OLO")
order2_cor <- seriate(as.dist(price_col_cor), "OLO")
ord1_cor <- get_order(order1_cor)
ord2_cor <- get_order(order2_cor)
reordmatr <- price_data_scale[rev(ord1),ord2]
reordmatr_cor <- price_data_scale[rev(ord1_cor),ord2_cor]
plot_ly(x=colnames(reordmatr), y=rownames(reordmatr),
z=reordmatr, type="heatmap", colors =colorRamp(c("yellow", "red"))) %>% layout(title = "Heatmap of price dataset ordered using Optimal leaf ordering")
plot_ly(x=colnames(reordmatr_cor), y=rownames(reordmatr_cor),
z=reordmatr_cor, type="heatmap", colors =colorRamp(c("yellow", "red"))) %>% layout(title = "Heatmap of correlations of price dataset")
order1_tsp <- seriate(price_row_dist, "TSP")
order2_tsp <- seriate(price_col_dist, "TSP")
ord1_tsp <- get_order(order1_tsp)
ord2_tsp <- get_order(order2_tsp)
reordmatr <- price_data_scale[rev(ord1_tsp),ord2_tsp]
plot_ly(x=colnames(reordmatr), y=rownames(reordmatr),
z=reordmatr, type="heatmap", colors =colorRamp(c("yellow", "red"))) %>% layout(title = "Heatmap of price dataset ordered using Traveling Sales Man")
## Comparing the optimization
criterion(dist(price_data_scale[rev(ord1_tsp)], method = "euclidean"), method = c("Gradient_raw", "Path_length"))
criterion(dist(price_data_scale[rev(ord1)], method = "euclidean"), method = c("Gradient_raw", "Path_length"))
setnames(price_data, old = c("Food.Costs...", "iPhone.4S.hr.", "Clothing.Index", "Hours.Worked",
"Wage.Net", "Vacation.Days", "Big.Mac.min.", "Bread.kg.in.min.",
"Rice.kg.in.min.", "Goods.and.Services..."),
new=c("FoodCosts", "iPhone4S", "ClothingIndex", "Hours", "NetWage", "VacationDays",
"BigMac", "Bread", "Rice", "GoodsServices"))
price_data[,-1] %>% plot_ly(type = 'parcoords',
dimensions = list(
list(label = "Food Costs", values = ~FoodCosts),
list(label = "iPhone 4S", values = ~iPhone4S),
list(label = "Clothing Index", values = ~ClothingIndex),
list(label = "Hours Worked", values = ~Hours),
list(label = "Wage Net", values = ~NetWage),
list(label = "Vacation Days", values = ~VacationDays),
list(label = "Big Mac", values = ~BigMac),
list(label = "Bread", values = ~Bread),
list(label = "Rice", values = ~Rice),
list(label = "Goods Services", values = ~GoodsServices)))
#coloring lines
data=price_data[,-1]
data$BigMac_cluster=ifelse(data$BigMac <= 21, 1, 0)
obj <- ggparcoord(data, scale="uniminmax", groupColumn = "BigMac_cluster")
ggplotly(obj)
reordmatr <- as.data.frame(reordmatr)
reordmatr$City = rownames(reordmatr)
reordmatr_transformed <- reordmatr%>%tidyr::gather(variable, value, -City, factor_key=T)%>%arrange(City)
first_col_order <- rownames(reordmatr)
second_col_order <- unique(reordmatr_transformed$variable)
reordmatr_transformed$City <- factor(reordmatr_transformed$City , levels = first_col_order)
reordmatr_transformed$variable <- factor(reordmatr_transformed$variable , levels = second_col_order)
reordmatr_transformed <- reordmatr_transformed[order(reordmatr_transformed$City, reordmatr_transformed$variable),]
radar_plot <- reordmatr_transformed %>% ggplot(aes(x=variable, y=value, group=City)) + geom_polygon(fill="blue") + coord_polar() + theme_bw() + facet_wrap(~ City) + theme(axis.text.x = element_text(size = 5))
ggsave("radar_plot.png", width = 40, height = 60, units = "cm")
knitr::include_graphics("radar_plot.png")
knitr::opts_chunk$set(echo = TRUE)
adult_data <- read.csv("adult.csv", header = FALSE)
colnames(adult_data) <- c("age", "workclass", "fnlwgt", "educationType", "education", "marital", "occupation", "relationship", "race", "sex", "capitalGain", "capitalLoss", "hours", "nativeCountry", "incomeLevel")
# Scatter Plot
ggplot(data = adult_data, aes(x=age, y=hours, colour = incomeLevel)) + geom_point() + ggtitle("Hours per Week vs. Age")
# Trellis Plot
ggplot(data = adult_data, aes(x=age, y=hours, color = incomeLevel)) + geom_point()+ facet_grid(incomeLevel~.) + ggtitle("Hours per Week vs. Age")
ggplot(adult_data, aes(age, colour = incomeLevel, fill = incomeLevel)) + geom_density(alpha = 0.1) + ggtitle("Denisty Plot of Age")
ggplot(data = adult_data, aes(x=age, color = incomeLevel, fill = incomeLevel)) + geom_density(alpha = 0.1) + facet_grid(marital~.) + ggtitle("Trellis Plot of Age vs. Marital Status")
adult_data %>% filter(capitalLoss > 0) %>% plot_ly(x = ~education, y = ~age, z = ~capitalLoss) %>% add_markers() %>% layout(scene = list(xaxis = list(title = 'Education'), yaxis = list(title = 'Age'), zaxis = list(title = 'CapitalLoss')))
adult_data$age_discreet <- cut_number(adult_data$age, 6)
raster_3d <- adult_data %>% filter(capitalLoss > 0) %>% ggplot(aes(x=education, y=capitalLoss)) + stat_density_2d(aes(fill = ..density..), geom = "raster", contour = FALSE) + facet_grid(age_discreet~.) + ggtitle("Trellis Plot of Captail Loss vs. Education")
ggsave("raster_3d.png", width = 40, height = 60, units = "cm")
knitr::include_graphics("raster_3d.png")
# Trellis Plot
adult_data$age_discreet <- cut_number(adult_data$age, 4)
ggplot(data = adult_data, aes(x=education, y=capitalLoss, color = capitalLoss)) +
geom_point() + facet_grid(age_discreet~.) +
ggtitle("Education vs. Capital Loss")
Agerange <-lattice::equal.count(adult_data$age, number=4, overlap=0.10) #overlap is 10%
L<-matrix(unlist(levels(Agerange)), ncol=2, byrow = T)
L1<-data.frame(Lower=L[,1],Upper=L[,2], Interval=factor(1:nrow(L)))
index=c()
Class=c()
for(i in 1:nrow(L)){
Cl=paste("[", L1$Lower[i], ",", L1$Upper[i], "]", sep="")
ind=which(adult_data$age>=L1$Lower[i] &adult_data$age<=L1$Upper[i])
index=c(index,ind)
Class=c(Class, rep(Cl, length(ind)))
}
df4<-adult_data[index,]
df4$Class<-as.factor(Class)
ggplot(data = df4, aes(x=education, y=capitalLoss, color = capitalLoss)) +
geom_point() + facet_grid(Class~.) +
ggtitle("Education vs. Capital Loss using Shingles")